package com.asiainfo.zqx;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import java.util.regex.Pattern;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Crawler for news articles on www.yicai.com, built on WebCollector's {@link BreadthCrawler}.
 *
 * <p>The crawl starts from the site home page. Pages that are not tagged with the
 * {@code "text"} type are scanned for anchors whose absolute URL matches the article
 * pattern; each match is queued with type {@code "text"}. Pages tagged {@code "text"} are
 * treated as articles and their URL, title, source, time, author and body are printed to
 * standard output.
 */
public class DiYicajingwang extends BreadthCrawler {
    /** Home page used as the single crawl seed. */
    private static final String SEED_URL = "https://www.yicai.com/";

    /**
     * Absolute URLs that are treated as article pages. Compiled once, with the literal dots
     * escaped so that {@code .} does not match arbitrary characters.
     */
    private static final Pattern ARTICLE_URL =
            Pattern.compile("https://www\\.yicai\\.com/news/.*\\.html");

    /** CrawlDatum type tag used to mark queued article pages. */
    private static final String ARTICLE_TYPE = "text";

    /** Marker in the byline that precedes the author name(s). */
    private static final String AUTHOR_MARKER = "作者";

    /** Marker in the byline that follows the author name(s). */
    private static final String EDITOR_MARKER = "责编";

    /** Author value reported when the byline carries no author marker. */
    private static final String DEFAULT_WRITER = "第一财经";

    /**
     * Creates the crawler, registers the seed URL and limits it to one thread.
     *
     * @param crawlPath path handed to {@link BreadthCrawler} for its crawl data
     */
    public DiYicajingwang(String crawlPath) {
        // NOTE(review): the second argument is presumably WebCollector's autoParse flag;
        // with false, links are only followed when visit() adds them — confirm.
        super(crawlPath, false);
        addSeed(SEED_URL);
        setThreads(1);
        // Note: setResumable(true) is not enabled.
    }

    /**
     * Handles one fetched page: prints it if it is an article, otherwise harvests article links.
     *
     * @param page the fetched page
     * @param crawlDatums collector for follow-up crawl tasks
     */
    @Override
    public void visit(Page page, CrawlDatums crawlDatums) {
        // Pages without a content type are skipped entirely.
        if (page.contentType() == null) {
            return;
        }
        if (page.matchType(ARTICLE_TYPE)) {
            printArticle(page);
        } else {
            collectArticleLinks(page, crawlDatums);
        }
    }

    /** Prints the URL, title, source, time, author and body text of an article page. */
    private static void printArticle(Page page) {
        System.out.println("连接为" + page.url());
        System.out.println("标题为" + page.select("div.title.f-pr>h1").text());
        System.out.println("来源为第一财经");
        String time = page.select("div.title.f-pr>p.f-cb>em.f-fl").text();
        System.out.println("时间为" + time);
        String writer = extractWriter(page.select("p.names").text());
        System.out.println("作者为" + writer);
        System.out.println("正文为" + page.select("div#multi-text>p").text());
    }

    /** Queues every anchor on the page whose absolute URL matches the article pattern. */
    private static void collectArticleLinks(Page page, CrawlDatums crawlDatums) {
        Elements anchors = page.select("a");
        for (Element anchor : anchors) {
            // Resolve the absolute URL once and reuse it for both the match and the new task.
            String href = anchor.attr("abs:href");
            if (ARTICLE_URL.matcher(href).matches()) {
                crawlDatums.add(new CrawlDatum(href, ARTICLE_TYPE));
            }
        }
    }

    /**
     * Extracts the author portion of a byline.
     *
     * <p>Returns the text between the author marker and the next editor marker. If the editor
     * marker does not follow the author marker, the rest of the byline is returned instead of
     * failing with an index error. If there is no author marker, the default source name is
     * returned.
     *
     * @param byline text of the byline element; must not be {@code null}
     * @return the extracted author text, or the default writer when no author marker exists
     */
    private static String extractWriter(String byline) {
        int markerAt = byline.indexOf(AUTHOR_MARKER);
        if (markerAt < 0) {
            return DEFAULT_WRITER;
        }
        int start = markerAt + AUTHOR_MARKER.length();
        // Search only after the author marker so an earlier editor marker cannot
        // produce an end index smaller than the start index.
        int end = byline.indexOf(EDITOR_MARKER, start);
        return end < 0 ? byline.substring(start) : byline.substring(start, end);
    }

    /**
     * Runs the crawler with crawl data stored under {@code "diyi"}.
     *
     * @param args unused
     * @throws Exception if the crawler fails to start or run
     */
    public static void main(String[] args) throws Exception {
        DiYicajingwang diyi = new DiYicajingwang("diyi");
        // NOTE(review): the argument is presumably the crawl depth (seed page + article pages) — confirm.
        diyi.start(2);
    }
}
